/*********************************************************************************

         Description: Cleaning merged census-marriage records data. 

         This do-file takes couple-level census data merged to marriage 
		 records and cleans and prepares it for OLS analysis. Outcome variables 
		 of the wife and husband end with _wife and _husb respectively. 
	
*********************************************************************************/

* Start from an empty session so nothing from an earlier run is carried over.
clear all

** Load census data merged to marriage records (1900, 1910, 1930) **

use "$dir/Data/Original/MR_Census_linked.dta", clear


*************************
******* Clean Data ******
*************************

* Couple identifier: one integer per distinct combination of id, groom_name
* and bride_name. Observations missing any of the three get a missing id
* (egen group() default behaviour).
egen couple_id = group(id groom_name bride_name)

* Surnames
* Discard the raw surname fields and keep the cleaned ones under the
* _wife / _husb suffixes used throughout this file.
drop bride_surname groom_surname
rename (bride_surname_clean groom_surname_clean) (surname_wife surname_husb)

* The couple-level surname is a copy of the husband surname.
generate surname = surname_husb

* State
* Rename the state fields with _marr and _residence suffixes, and drop the
* birth-state fields, which are not used below.
rename (state state_fips statefip) (state_marr statefip_marr statefip_residence)
drop birthstate birthstate_fips


* Migration (pre and post marriage)
* Pre-marriage indicators: birthplace code differs from the marriage-state code.
* Post-marriage indicator: marriage-state code differs from the residence-state code.
* In Stata a comparison such as x != y is true when exactly one side is missing,
* so without a guard a couple with an unknown state would be coded as a migrant.
* The indicators are left missing whenever either input is missing.
gen mig_premarr_h = bpl_husb != statefip_marr if !missing(bpl_husb, statefip_marr)
gen mig_premarr_w = bpl_wife != statefip_marr if !missing(bpl_wife, statefip_marr)

gen mig_postmarr = statefip_marr != statefip_residence if !missing(statefip_marr, statefip_residence)



/* County identifier: one integer per (statefip_residence, countyicp) pair */
egen county_id = group(statefip_residence countyicp)

* Enumeration district
* Built from the husband enumeration district and year; the raw husband and
* wife district fields are dropped afterwards.
* NOTE(review): this groups on year while the next egen uses cen_yr. Confirm
* which variable year refers to in the input data.
egen enumdist = group(enumdist_husb year)
drop enumdist_husb enumdist_wife

* Enumeration district X Census Year (within county)
egen enumdistXyear = group(enumdist cen_yr county_id)


* Farm / Rural / Urban
* Shift both codes down by one and remove their value-label definitions.
* NOTE(review): an observation coded 0 before the shift becomes -1. Confirm
* that the inputs only take the values 1 and 2.
foreach place in farm urban {
	replace `place' = `place' - 1
}

label drop farm_lbl urban_lbl


* City pop
* The husband value is used for the couple. Zeros are set to 1 so that the
* log is 0 instead of missing.
generate citypop = cond(citypop_husb == 0, 1, citypop_husb)
drop citypop_*
generate ln_citypop = ln(citypop)


* Treatment variable (isonymous marriage): a copy of is_mar_max
generate isonymous = is_mar_max

* Occscore
* Log occupational score for each spouse. ln() returns missing for a score of
* zero or a missing score.
foreach s in husb wife {
	generate ln_occscore_`s' = ln(occscore_`s')
}

* Self-employment
* 1 when class of worker equals 1, 0 when it equals 2, missing for any other
* value (including missing).
generate selfemployed_h = (classwkr_husb == 1) if inlist(classwkr_husb, 1, 2)
generate selfemployed_w = (classwkr_wife == 1) if inlist(classwkr_wife, 1, 2)


* LFP
* A spouse is coded as in the labor force when the occupational score is
* nonzero. In Stata a missing value satisfies != 0, so an unguarded comparison
* would code spouses with a missing score as working. The indicator is left
* missing in that case.
gen lfp_wife = occscore_wife != 0 if !missing(occscore_wife)
gen lfp_husb = occscore_husb != 0 if !missing(occscore_husb)


* Education score
* Values above 900 are set to missing before taking logs.
foreach s in husb wife {
	replace edscor50_`s' = . if edscor50_`s' > 900
	generate ln_edscore_`s' = ln(edscor50_`s')
}



* BPL-mismatch (Husb and Wife born in different places)
* bpl_match is the complement of bpl_differ. Two missing birthplaces compare
* as equal, and one missing birthplace counts as a mismatch.
* NOTE(review): bpl_h / bpl_w are used here while the migration indicators use
* bpl_husb / bpl_wife. Confirm that these refer to the same variables.
generate bpl_differ = !(bpl_h == bpl_w)
generate bpl_match = !bpl_differ


* Surname commonness
* Number of observations with a nonmissing id that share the same surname,
* computed on the sample as it stands at this point, and its log.
egen surname_count = count(id), by(surname)
generate ln_surname_count = ln(surname_count)



* Family outcomes
* Indicator for multgen equal to 3.
generate multgen_high = multgen == 3

* Household-level counts are taken from the husband record; the wife copies
* are dropped.
foreach v in ncouples famsize nchlt5 {
	rename `v'_husb `v'
}
drop ncouples_wife famsize_wife nchlt5_wife

* Exclude the couple itself from the count, with a floor of zero. max()
* ignores missing arguments, so a missing count becomes 0 here.
replace ncouples = max(0, ncouples - 1)

** Add parents living in household
* A spouse is flagged when either poploc or momloc is positive and not the
* system missing value. parents is the sum of the two flags (0, 1 or 2).
* NOTE(review): poploc / momloc are presumably pointers to a co-resident
* father / mother. Confirm against the data dictionary.
foreach s in husb wife {
	generate parents_`s' = (poploc_`s' > 0 & poploc_`s' != .) | (momloc_`s' > 0 & momloc_`s' != .)
}

generate parents = parents_husb + parents_wife


** Age at marriage
* Age at the census minus the number of years elapsed since the marriage.
foreach s in husb wife {
	generate marrage_`s' = age_`s' - (cen_yr - year_of_marriage)
}

* Early-marriage indicators at cutoffs 20, 18 and 16. A missing marriage age
* compares as larger than any number, so it yields 0 here.
foreach s in wife husb {
	foreach cutoff in 20 18 16 {
		generate marrage_`s'_pre`cutoff' = marrage_`s' < `cutoff'
	}
}



** Age difference
* Husband minus wife; values outside the 1st-99th percentiles are set to
* missing (winsor2 with the trim option).
generate age_diff = age_h - age_w
winsor2 age_diff , replace trim cuts(1 99) 



** Birthyear
* 9999 is recoded to missing before copying into the yob_ variables.
foreach s in wife husb {
	replace birthyr_`s' = . if birthyr_`s' == 9999
}

generate yob_husb = birthyr_husb
generate yob_wife = birthyr_wife



** Disability
* Per-spouse deaf/blind flag: take the larger of the blind and deaf codes,
* then map code 1 to 0 and code 2 to 1. Other values are left unchanged.
egen deafblind_h = rowmax(blind_husb deaf_husb)
egen deafblind_w = rowmax(blind_wife deaf_wife)
foreach flag of varlist deafblind_h deafblind_w {
	replace `flag' = `flag' - 1 if inlist(`flag', 1, 2)
}

* Couple-level flag: either spouse.
egen deafblind = rowmax(deafblind_h deafblind_w)


drop gqtype

* Medical group quarters, from the husband detailed group-quarters code:
* 1 for code 300 or codes 450-469, 0 for any other value, and missing when
* the code is the system missing value.
generate gq_medical = (gqtyped_husb == 300 | inrange(gqtyped_husb, 450, 469)) if gqtyped_husb != .


* Combined indicator: deaf/blind or medical group quarters.
egen disabled = rowmax(deafblind gq_medical)

* Household-member disability measure, rescaled by a factor of 1000.
rename anyhmem_disable deafblind_hmem
replace deafblind_hmem = 1000 * deafblind_hmem

************************************
*** Sample selection (drop obs) ****
************************************

* Keep only US-born
* Both spouses must have a birthplace code below 60. Missing codes fail the
* comparison and are removed.
drop if !(bpl_h < 60 & bpl_w < 60)

* Keep only couples whose husband is aged 18-50 (the wife age is not filtered)
keep if inrange(age_husb, 18, 50)

* Drop extreme marriage ages
* Flag observations outside the 1st-99th percentile of either spouse marriage
* age. Both sets of percentiles are computed before anything is removed, so
* the second variable is not evaluated on an already trimmed sample.
* A missing marriage age compares as larger than p99 and is dropped as well.
generate drop_obs = 0
foreach agevar in marrage_wife marrage_husb {
	quietly summarize `agevar', detail
	replace drop_obs = 1 if `agevar' < r(p1) | `agevar' > r(p99)
}
keep if drop_obs == 0
drop drop_obs


* Shrink storage types to the smallest that hold the data without loss.
compress

* Variable labels for the analysis variables.

* Treatment
label var isonymous "Isonymous marriage"

* Economic outcomes
label var ln_occscore_husb "lnOccscore M"
label var ln_occscore_wife "lnOccscore F"
label var prank_husb "Income percentile-rank (Husband)"
label var ln_inchat_husb "LIDO Income (Husband)"

label var ln_edscore_husb "lnEdscor M"
label var ln_edscore_wife "lnEdscor F"
label var lifetime_mig_husb "LifeMig M"
label var lifetime_mig_wife "LifeMig F"
label var selfemployed_h "Self-employed M"
label var selfemployed_w "Self-employed F"
label var lfp_husb "Working M"
label var lfp_wife "Working F"
label var bpl_match "Spouses born same state"

* Place and household characteristics
label var urban "Urban"
label var farm "Farm"
label var parents "Live w/ a parent"
label var marrage_wife "Marr Age F"
label var marrage_husb "Marr Age M"
label var age_diff "Age diff (MvF)"

* Disability
label var deafblind_h "Deaf/blind M"
label var deafblind_w "Deaf/blind F"
label var deafblind "Deaf/blind (either)"
label var gq_medical "Medical institution"
label var disabled "Deaf/blind/institution"
label var deafblind_hmem "Deaf/Blind (Any HH member)"

* Early-marriage indicators. All six dummies created above are labelled;
* the 20-year cutoff for wives and the 16-year cutoff for husbands were
* previously left without a label.
label var marrage_wife_pre20 "MAge < 20 (F)"
label var marrage_wife_pre18 "MAge < 18 (F)"
label var marrage_wife_pre16 "MAge < 16 (F)"
label var marrage_husb_pre20 "MAge < 20 (M)"
label var marrage_husb_pre18 "MAge < 18 (M)"
label var marrage_husb_pre16 "MAge < 16 (M)"

* Family structure
label var multgen_high "Multi-gen HH (3+)"
label var ncouples "Couples in unit (excluding self)"
label var famsize "Family size"
label var nchlt5 "Children under 5"

* Save data ready for analysis (overwrites any existing file)
save "$dir/Data/Final/MR_Census_linked_clean.dta", replace
